#!/usr/bin/env python 
# -*- coding:utf-8 -*-


import urllib



def getHtml(url):
    """Fetch ``url`` and return the body of the response.

    The request is made with ``urllib.urlopen`` (the Python 2 API), and the
    value returned is whatever ``read()`` on the opened response yields.

    :param url: address of the page to download.
    :return: the full response body.
    """
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        # Always release the connection, even if read() raises part-way.
        page.close()


def content(html):
    """Cut the article listing out of a full page.

    Returns the text located after the first opening article marker and
    before the first pagination marker that follows it.  If the opening
    marker is missing the result is an empty string; if the pagination
    marker is missing, everything after the opening marker is returned.

    :param html: page source to slice.
    :return: the text between the two markers (markers excluded).
    """
    # Markers that delimit the interesting part of the page.
    opening = '<article class="excerpt excerpt-one">'
    closing = '<div class="pagination"'

    _, _, after_opening = html.partition(opening)
    listing, _, _ = after_opening.partition(closing)
    return listing


def title(content, beg=0):
    """Collect every ``title`` fragment found in ``content``.

    Scanning starts at offset ``beg``.  Each fragment runs from an
    occurrence of ``title`` up to (not including) the next
    ``target="_blank">``; the search then resumes from that point.

    :param content: text to scan.
    :param beg: offset at which the scan starts.
    :return: list of the extracted slices, in order of appearance.
    """
    found = []
    cursor = beg
    try:
        while True:
            start = content.index('title', cursor)
            cursor = content.index('target="_blank">', start)
            found.append(content[start:cursor])
    except ValueError:
        # index() raises once either marker can no longer be found,
        # which is what ends the scan.
        return found


def get_img(content, beg=0):
    """Collect every image fragment found in ``content``.

    Scanning starts at offset ``beg``.  Each fragment runs from an
    occurrence of ``<img src=`` up to (not including) the next
    ``class="thumb"/>``; the search then resumes from that point.

    :param content: text to scan.
    :param beg: offset at which the scan starts.
    :return: list of the extracted slices, in order of appearance.
    """
    found = []
    cursor = beg
    try:
        while True:
            start = content.index('<img src=', cursor)
            cursor = content.index('class="thumb"/>', start)
            found.append(content[start:cursor])
    except ValueError:
        # index() raises once either marker can no longer be found,
        # which is what ends the scan.
        return found


def data_out(title, img, path=r"D:\worm.text"):
    """Append title/image pairs to a text file.

    A newline is written first, then one ``title$img`` line per pair.
    Entries are paired by position; when the two sequences differ in
    length, writing stops at the end of the shorter one instead of failing
    half-way through the file.

    NOTE: entries are written as-is; nothing checks that an image entry is
    actually a URL.

    :param title: sequence of title strings.
    :param img: sequence of image strings, matched to ``title`` by position.
    :param path: file to append to.  The default is a raw string because
        ``\\w`` is not a valid escape sequence; the resulting path is the
        same as before.
    """
    with open(path, "a+") as fo:
        fo.write('\n')
        for heading, image in zip(title, img):
            fo.write(heading + '$' + image + '\n')


if __name__=='__main__':
    # Results get their own names: binding them to ``content`` / ``title``
    # would replace the module-level functions of the same name.
    page_html = getHtml("https://bh.sb/post/category/main/")
    page_content = content(page_html)
    titles = title(page_content)
    images = get_img(page_content)
    data_out(titles, images)